Load raw data, annotate probes using biomaRt, filter probes and load SFARI genes
Filtering criteria:
Probes with no length informatoin in datProbes (5)
Samples from Subject AN03345 (2)
Probes that have no expression on at least half of the samples (30197)
Probes that are present in BrainSpan filtered dataset ()
if(!file.exists('./../Data/Gandal_RNASeq_intersect_BrainSpan.RData')){
# Load csvs
datExpr = read.csv('./../../Gandal/RNAseq/raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../../Gandal/RNAseq/raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../../Gandal/RNAseq/working_data/SFARI_genes_01-15-2019.csv')
# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
print('Columns in datExpr don\'t match the rowd in datMeta!')
}
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
#################################################################################
# FILTERS:
# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
# 3. Filter samples with rowSums <= 40 (at least half of the samples without any expression)
to_keep = rowSums(datExpr)>40
datExpr = datExpr[to_keep,]
datProbes = datProbes[to_keep,]
# 4. Filter probes that are not in the Gandal filtered dataset
to_keep = read.csv('./../Data/Gandal_BrainSpan_probes.csv', sep='')
datProbes = datProbes[rownames(datProbes) %in% to_keep$x,]
datExpr = datExpr[rownames(datExpr) %in% to_keep$x,]
#################################################################################
# DEA using DESeq2
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
ddsSE = DESeqDataSet(se, design =~Diagnosis_)
dds = DESeq(ddsSE)
DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
mutate('logFC'=log2FoldChange, 'adj.P.Val'=padj) %>%
dplyr::select(-log2FoldChange, -padj)
#################################################################################
# Annotate SFARI genes
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
#################################################################################
# Add functional annotation to genes from GO
GO_annotations = read.csv('./../../Gandal/RNAseq/working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
save(datExpr, datMeta, datProbes, GO_neuronal, SFARI_genes, DE_info, file='./../Data/Gandal_RNASeq_intersect_BrainSpan.RData')
rm(getinfo, to_keep, gene_names, mart, counts, dds, ddsSE, GO_annotations, rowRanges, se)
} else load('./../Data/Gandal_RNASeq_intersect_BrainSpan.RData')
## Parsed with column specification:
## cols(
## status = col_double(),
## `gene-symbol` = col_character(),
## `gene-name` = col_character(),
## chromosome = col_character(),
## `genetic-category` = col_character(),
## `gene-score` = col_double(),
## syndromic = col_double(),
## `number-of-reports` = col_double()
## )
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 276 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
datExpr_backup = datExpr
Number of genes and samples:
dim(datExpr)
## [1] 17621 86
Gene count by SFARI score of remaining genes:
table(SFARI_genes$`gene-score`)
##
## 1 2 3 4 5 6
## 29 82 209 538 191 25
Gene count by Diagnosis and Brain lobe:
t(table(datMeta$Brain_lobe, datMeta$Diagnosis_))
##
## Frontal Temporal Parietal Occipital
## ASD 8 14 14 15
## CTL 13 6 8 8
Relation between SFARI scores and Neuronal functional annotation:
There is one gene in the SFARI list that has score both 3 and 4
Higher SFARI scores have a higher percentage of genes with neuronal functional annotation (makes sense)
Neuronal_SFARI = data.frame('ID'=rownames(datExpr), 'Neuronal'=rownames(datExpr) %in% GO_neuronal$ID) %>%
left_join(SFARI_genes, by='ID')
tbl = table(Neuronal_SFARI$`gene-score`, Neuronal_SFARI$Neuronal, useNA='ifany') %>% t %>% as.data.frame.matrix
rownames(tbl) = c('Non-Neuronal','Neuronal')
tbl
## 1 2 3 4 5 6 NA
## Non-Neuronal 15 43 133 303 113 16 15988
## Neuronal 8 16 39 65 34 3 845
tbl = round(sweep(tbl, 2, colSums(tbl), `/`)*100, 2)
tbl
## 1 2 3 4 5 6 NA
## Non-Neuronal 65.22 72.88 77.33 82.34 76.87 84.21 94.98
## Neuronal 34.78 27.12 22.67 17.66 23.13 15.79 5.02
rm(Neuronal_SFARI, tbl)
make_ASD_vs_CTL_df = function(datExpr){
datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))
ASD_vs_CTL = data.frame('ID'=as.character(rownames(datExpr)),
'mean' = rowMeans(datExpr), 'sd' = apply(datExpr, 1, sd),
'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
left_join(SFARI_genes, by='ID') %>%
dplyr::select(ID, mean, sd, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
mutate('Neuronal'=ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Non-Neuronal'))
ASD_vs_CTL_SFARI = ASD_vs_CTL %>% filter(!is.na(`gene-score`)) %>%
mutate(Group = as.character(`gene-score`)) %>% dplyr::select(-c(Neuronal,`gene-score`))
ASD_vs_CTL_Neuro = ASD_vs_CTL %>% mutate(Group = Neuronal) %>% dplyr::select(-c(Neuronal,`gene-score`))
ASD_vs_CTL_all = ASD_vs_CTL %>% mutate(Group = 'All') %>% dplyr::select(-c(Neuronal,`gene-score`))
ASD_vs_CTL_together = bind_rows(ASD_vs_CTL_SFARI, ASD_vs_CTL_Neuro, ASD_vs_CTL_all) %>%
mutate(Group = ordered(Group, levels=c('1','2','3','4','5','6',
'Neuronal','Non-Neuronal','All'))) %>%
left_join(DE_info, by='ID')
return(ASD_vs_CTL_together)
}
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Mean (left) and Standard Deviation (right) by score') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Mean (left) and Standard Deviation (right) by score') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
There is heterocedasticity in the data, but the variance increases at a lower rate than the mean, so the \(log_2\) transformation may affect the variance
ggplot(ASD_vs_CTL, aes(mean+1, sd+1)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +
scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') +
ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()
The higher the SFARI score, the bigger the difference between the level of expression between Autism and Control
The higher the SFARI score, the smaller the log Fold Change between Autism and Control
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_)
dds = estimateSizeFactors(dds)
vst_output = vst(dds)
datExpr_vst = assay(vst_output)
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_vst)
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) +
geom_boxplot() + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) +
geom_boxplot() + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +
ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()
The higher the SFARI score, the smaller the difference between the level of expression between Autism and Control
The second plot is the same as with the raw data, but this time it agrees with the left plot
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
The higher the SFARI score:
The higher the percentage of the genes with Neuronal functional annotation
The higher the level of expression
The higher the difference between Autism and Control gene expression levels, but only in raw data (probably because of heteroscedasticity)
The lower the log Fold Change